# ML for continuous target variable
# 1. Tree Based Models
# 2. Regression
# 3. Neural Networks
# Load shared libraries (data.table, ggplot2, lubridate, ...) plus helper
# functions: f_partition() for the train/test split and the regression
# metrics rmse()/mae()/mape() used at the end of this script.
# NOTE(review): absolute, machine-specific paths — consider relative paths.
source('/Users/ssobrinou/IE/Advanced/2019_Advanced/Classification/code/carga_librerias.R')
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday,
## week, yday, year
## The following object is masked from 'package:base':
##
## date
source('/Users/ssobrinou/IE/Advanced/2019_Advanced/Regression/code/f_partition.R')
source('/Users/ssobrinou/IE/Advanced/2019_Advanced/Regression/code/regression_metrics.R')
# Read the prepared automobile dataset and split it into a list with
# $train (80%) and $test (20%); the seed fixes the split for reproducibility.
whole_data<-f_partition(df=fread('/Users/ssobrinou/IE/Advanced/2019_Advanced/Datasets/data_automobile_ready.csv'),
test_proportion = 0.2,
seed = 872367823)
# Inspect both partitions: 31 variables, continuous target = price
str(whole_data)
## List of 2
## $ train:Classes 'data.table' and 'data.frame': 156 obs. of 31 variables:
## ..$ fuel_gas : int [1:156] 1 1 1 1 1 1 1 1 1 1 ...
## ..$ aspiration_turbo : int [1:156] 0 0 0 0 0 0 1 0 0 0 ...
## ..$ doors_others : int [1:156] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ doors_two : int [1:156] 0 1 0 1 1 1 1 1 0 0 ...
## ..$ body_others : int [1:156] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ body_sedan : int [1:156] 1 0 0 0 0 0 0 1 1 1 ...
## ..$ body_wagon : int [1:156] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ drive_others : int [1:156] 0 0 0 0 0 0 0 0 1 0 ...
## ..$ drive_rwd : int [1:156] 0 0 0 1 1 0 1 1 0 0 ...
## ..$ engine_loc_others : int [1:156] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ wheel_base : num [1:156] 93.1 86.6 97.2 98.4 94.5 94.5 95.9 94.5 97 96.5 ...
## ..$ length : num [1:156] 167 145 173 176 169 ...
## ..$ width : num [1:156] 64.2 63.9 65.2 65.6 64 64 66.3 64 65.4 64 ...
## ..$ height : num [1:156] 54.1 50.8 54.7 52 52.6 51.4 50.2 52.6 54.3 54.5 ...
## ..$ weight : int [1:156] 1950 1819 2324 2714 2204 2221 2818 2169 2385 2010 ...
## ..$ engine_type_others: int [1:156] 0 0 0 0 0 0 0 0 1 0 ...
## ..$ cyl_others : int [1:156] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ cyl_six : int [1:156] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ engine_size : int [1:156] 91 92 120 146 98 109 156 98 108 92 ...
## ..$ fuel_sys_idi : int [1:156] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ fuel_sys_mpfi : int [1:156] 0 0 0 1 0 1 0 0 0 0 ...
## ..$ fuel_sys_others : int [1:156] 0 1 0 0 0 0 1 0 0 1 ...
## ..$ bore : num [1:156] 3.08 2.91 3.33 3.62 3.19 3.19 3.59 3.19 3.62 2.91 ...
## ..$ stroke : num [1:156] 3.15 3.41 3.47 3.5 3.03 3.4 3.86 3.03 2.64 3.41 ...
## ..$ compr_ratio : num [1:156] 9 9.2 8.5 9.3 9 8.5 7 9 9 9.2 ...
## ..$ hp : int [1:156] 68 76 97 116 70 90 145 70 82 76 ...
## ..$ peak_rpm : int [1:156] 5000 6000 5200 4800 4800 5500 5000 4800 4800 6000 ...
## ..$ city_mpg : int [1:156] 31 31 27 24 29 24 19 29 24 30 ...
## ..$ high_mpg : int [1:156] 38 38 34 30 34 29 24 34 25 34 ...
## ..$ price : int [1:156] 7395 6855 8949 11549 8238 9980 12764 8058 9233 7295 ...
## ..$ make_agg_toyota : int [1:156] 0 0 0 1 1 0 0 1 0 0 ...
## ..- attr(*, ".internal.selfref")=<externalptr>
## $ test :Classes 'data.table' and 'data.frame': 39 obs. of 31 variables:
## ..$ fuel_gas : int [1:39] 1 1 1 1 1 1 1 1 1 1 ...
## ..$ aspiration_turbo : int [1:39] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ doors_others : int [1:39] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ doors_two : int [1:39] 0 0 0 1 1 0 0 1 1 0 ...
## ..$ body_others : int [1:39] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ body_sedan : int [1:39] 1 0 1 0 0 0 1 0 0 1 ...
## ..$ body_wagon : int [1:39] 0 1 0 0 0 0 0 0 0 0 ...
## ..$ drive_others : int [1:39] 1 0 0 0 0 0 0 0 0 0 ...
## ..$ drive_rwd : int [1:39] 0 0 1 0 0 0 0 0 0 0 ...
## ..$ engine_loc_others : int [1:39] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ wheel_base : num [1:39] 99.4 105.8 103.5 88.4 93.7 ...
## ..$ length : num [1:39] 177 193 189 141 157 ...
## ..$ width : num [1:39] 66.4 71.4 66.9 60.3 63.8 63.8 63.8 64 65.2 65.2 ...
## ..$ height : num [1:39] 54.3 55.7 55.7 53.2 50.8 50.6 50.6 52.6 53.3 54.1 ...
## ..$ weight : int [1:39] 2824 2954 3230 1488 1876 1967 1989 1940 2289 2304 ...
## ..$ engine_type_others: int [1:39] 0 0 0 1 0 0 0 0 0 0 ...
## ..$ cyl_others : int [1:39] 1 1 0 1 0 0 0 0 0 0 ...
## ..$ cyl_six : int [1:39] 0 0 1 0 0 0 0 0 0 0 ...
## ..$ engine_size : int [1:39] 136 136 209 61 90 90 90 92 110 110 ...
## ..$ fuel_sys_idi : int [1:39] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ fuel_sys_mpfi : int [1:39] 1 1 1 0 0 0 0 0 0 0 ...
## ..$ fuel_sys_others : int [1:39] 0 0 0 0 0 0 0 1 1 1 ...
## ..$ bore : num [1:39] 3.19 3.19 3.62 2.91 2.97 2.97 2.97 2.91 3.15 3.15 ...
## ..$ stroke : num [1:39] 3.4 3.4 3.39 3.03 3.23 3.23 3.23 3.41 3.58 3.58 ...
## ..$ compr_ratio : num [1:39] 8 8.5 8 9.5 9.4 9.4 9.4 9.2 9 9 ...
## ..$ hp : int [1:39] 115 110 182 48 68 68 68 76 86 86 ...
## ..$ peak_rpm : int [1:39] 5500 5500 5400 5100 5500 5500 5500 6000 5800 5800 ...
## ..$ city_mpg : int [1:39] 18 19 16 47 31 31 31 30 27 27 ...
## ..$ high_mpg : int [1:39] 22 25 22 53 38 38 38 34 33 33 ...
## ..$ price : int [1:39] 17450 18920 30760 5151 6377 6229 6692 6529 9095 8845 ...
## ..$ make_agg_toyota : int [1:39] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, ".internal.selfref")=<externalptr>
# Convert every integer column to numeric, by reference, in both the train
# and test data.tables so all downstream models see doubles.
# Fix vs. original: the integer-column mask was computed twice via sapply();
# compute the column names once and use vapply() for a type-stable check.
whole_data<-lapply(whole_data, function(x){
  int_cols <- names(x)[vapply(x, is.integer, logical(1))]
  if (length(int_cols) > 0) {
    # (cols) := assigns in place by reference (data.table semantics)
    x[, (int_cols) := lapply(.SD, as.numeric), .SDcols = int_cols]
  }
  return(x)
})
# Confirm all former 'int' columns now show as 'num'
str(whole_data)
## List of 2
## $ train:Classes 'data.table' and 'data.frame': 156 obs. of 31 variables:
## ..$ fuel_gas : num [1:156] 1 1 1 1 1 1 1 1 1 1 ...
## ..$ aspiration_turbo : num [1:156] 0 0 0 0 0 0 1 0 0 0 ...
## ..$ doors_others : num [1:156] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ doors_two : num [1:156] 0 1 0 1 1 1 1 1 0 0 ...
## ..$ body_others : num [1:156] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ body_sedan : num [1:156] 1 0 0 0 0 0 0 1 1 1 ...
## ..$ body_wagon : num [1:156] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ drive_others : num [1:156] 0 0 0 0 0 0 0 0 1 0 ...
## ..$ drive_rwd : num [1:156] 0 0 0 1 1 0 1 1 0 0 ...
## ..$ engine_loc_others : num [1:156] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ wheel_base : num [1:156] 93.1 86.6 97.2 98.4 94.5 94.5 95.9 94.5 97 96.5 ...
## ..$ length : num [1:156] 167 145 173 176 169 ...
## ..$ width : num [1:156] 64.2 63.9 65.2 65.6 64 64 66.3 64 65.4 64 ...
## ..$ height : num [1:156] 54.1 50.8 54.7 52 52.6 51.4 50.2 52.6 54.3 54.5 ...
## ..$ weight : num [1:156] 1950 1819 2324 2714 2204 ...
## ..$ engine_type_others: num [1:156] 0 0 0 0 0 0 0 0 1 0 ...
## ..$ cyl_others : num [1:156] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ cyl_six : num [1:156] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ engine_size : num [1:156] 91 92 120 146 98 109 156 98 108 92 ...
## ..$ fuel_sys_idi : num [1:156] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ fuel_sys_mpfi : num [1:156] 0 0 0 1 0 1 0 0 0 0 ...
## ..$ fuel_sys_others : num [1:156] 0 1 0 0 0 0 1 0 0 1 ...
## ..$ bore : num [1:156] 3.08 2.91 3.33 3.62 3.19 3.19 3.59 3.19 3.62 2.91 ...
## ..$ stroke : num [1:156] 3.15 3.41 3.47 3.5 3.03 3.4 3.86 3.03 2.64 3.41 ...
## ..$ compr_ratio : num [1:156] 9 9.2 8.5 9.3 9 8.5 7 9 9 9.2 ...
## ..$ hp : num [1:156] 68 76 97 116 70 90 145 70 82 76 ...
## ..$ peak_rpm : num [1:156] 5000 6000 5200 4800 4800 5500 5000 4800 4800 6000 ...
## ..$ city_mpg : num [1:156] 31 31 27 24 29 24 19 29 24 30 ...
## ..$ high_mpg : num [1:156] 38 38 34 30 34 29 24 34 25 34 ...
## ..$ price : num [1:156] 7395 6855 8949 11549 8238 ...
## ..$ make_agg_toyota : num [1:156] 0 0 0 1 1 0 0 1 0 0 ...
## ..- attr(*, ".internal.selfref")=<externalptr>
## $ test :Classes 'data.table' and 'data.frame': 39 obs. of 31 variables:
## ..$ fuel_gas : num [1:39] 1 1 1 1 1 1 1 1 1 1 ...
## ..$ aspiration_turbo : num [1:39] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ doors_others : num [1:39] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ doors_two : num [1:39] 0 0 0 1 1 0 0 1 1 0 ...
## ..$ body_others : num [1:39] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ body_sedan : num [1:39] 1 0 1 0 0 0 1 0 0 1 ...
## ..$ body_wagon : num [1:39] 0 1 0 0 0 0 0 0 0 0 ...
## ..$ drive_others : num [1:39] 1 0 0 0 0 0 0 0 0 0 ...
## ..$ drive_rwd : num [1:39] 0 0 1 0 0 0 0 0 0 0 ...
## ..$ engine_loc_others : num [1:39] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ wheel_base : num [1:39] 99.4 105.8 103.5 88.4 93.7 ...
## ..$ length : num [1:39] 177 193 189 141 157 ...
## ..$ width : num [1:39] 66.4 71.4 66.9 60.3 63.8 63.8 63.8 64 65.2 65.2 ...
## ..$ height : num [1:39] 54.3 55.7 55.7 53.2 50.8 50.6 50.6 52.6 53.3 54.1 ...
## ..$ weight : num [1:39] 2824 2954 3230 1488 1876 ...
## ..$ engine_type_others: num [1:39] 0 0 0 1 0 0 0 0 0 0 ...
## ..$ cyl_others : num [1:39] 1 1 0 1 0 0 0 0 0 0 ...
## ..$ cyl_six : num [1:39] 0 0 1 0 0 0 0 0 0 0 ...
## ..$ engine_size : num [1:39] 136 136 209 61 90 90 90 92 110 110 ...
## ..$ fuel_sys_idi : num [1:39] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ fuel_sys_mpfi : num [1:39] 1 1 1 0 0 0 0 0 0 0 ...
## ..$ fuel_sys_others : num [1:39] 0 0 0 0 0 0 0 1 1 1 ...
## ..$ bore : num [1:39] 3.19 3.19 3.62 2.91 2.97 2.97 2.97 2.91 3.15 3.15 ...
## ..$ stroke : num [1:39] 3.4 3.4 3.39 3.03 3.23 3.23 3.23 3.41 3.58 3.58 ...
## ..$ compr_ratio : num [1:39] 8 8.5 8 9.5 9.4 9.4 9.4 9.2 9 9 ...
## ..$ hp : num [1:39] 115 110 182 48 68 68 68 76 86 86 ...
## ..$ peak_rpm : num [1:39] 5500 5500 5400 5100 5500 5500 5500 6000 5800 5800 ...
## ..$ city_mpg : num [1:39] 18 19 16 47 31 31 31 30 27 27 ...
## ..$ high_mpg : num [1:39] 22 25 22 53 38 38 38 34 33 33 ...
## ..$ price : num [1:39] 17450 18920 30760 5151 6377 ...
## ..$ make_agg_toyota : num [1:39] 0 0 0 0 0 0 0 0 0 0 ...
## ..- attr(*, ".internal.selfref")=<externalptr>
# we start defining a formula
# NOTE(review): this binding shadows base::formula() for the session; the
# model calls below are unaffected because they use it as a value.
formula<-as.formula(price~.) # price against all other variables
#### 1.1 Base R Partitioning Tree
library(rpart)
library(rpart.plot)
# Fit an 'anova' (regression) tree for price on all predictors;
# model=TRUE keeps the model frame inside the fitted object.
tree_0<-rpart(formula = formula, data = whole_data$train, method = 'anova', model=TRUE)
# Text dump of the tree: node number, split rule, n, deviance, fitted value
print(tree_0)
## n= 156
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 156 9507260000 13064.040
## 2) engine_size< 182 146 3229339000 11427.030
## 4) weight< 2544 89 442823600 8395.393
## 8) length< 172.7 63 86829530 7401.762 *
## 9) length>=172.7 26 143078300 10803.040 *
## 5) weight>=2544 57 691334500 16160.630
## 10) width< 68.6 50 491555400 15542.020
## 20) hp< 118 27 142082300 13943.780 *
## 21) hp>=118 23 199542200 17418.220
## 42) stroke>=3.31 10 21436830 14821.100 *
## 43) stroke< 3.31 13 58770460 19416.000 *
## 11) width>=68.6 7 43973520 20579.290 *
## 3) engine_size>=182 10 174348500 36964.500 *
# Detailed report: CP/xerror table, variable importance, primary and
# surrogate splits per node
summary(tree_0)
## Call:
## rpart(formula = formula, data = whole_data$train, method = "anova",
## model = TRUE)
## n= 156
##
## CP nsplit rel error xerror xstd
## 1 0.64199066 0 1.00000000 1.0061701 0.19634783
## 2 0.22037697 1 0.35800934 0.4096693 0.05526871
## 3 0.02239507 2 0.13763236 0.1966299 0.03396699
## 4 0.01638806 3 0.11523729 0.1784056 0.03210153
## 5 0.01577015 4 0.09884923 0.1801067 0.02947649
## 6 0.01255198 5 0.08307909 0.1762945 0.02932287
## 7 0.01000000 6 0.07052711 0.1606481 0.02614540
##
## Variable importance
## engine_size weight hp city_mpg width length
## 22 18 16 16 9 9
## high_mpg drive_rwd
## 5 4
##
## Node number 1: 156 observations, complexity param=0.6419907
## mean=13064.04, MSE=6.094397e+07
## left son=2 (146 obs) right son=3 (10 obs)
## Primary splits:
## engine_size < 182 to the left, improve=0.6419907, (0 missing)
## city_mpg < 17.5 to the right, improve=0.5479412, (0 missing)
## hp < 175.5 to the left, improve=0.5262288, (0 missing)
## weight < 2697.5 to the left, improve=0.5036809, (0 missing)
## high_mpg < 28.5 to the right, improve=0.4750277, (0 missing)
## Surrogate splits:
## weight < 3495 to the left, agree=0.981, adj=0.7, (0 split)
## hp < 175.5 to the left, agree=0.981, adj=0.7, (0 split)
## city_mpg < 16.5 to the right, agree=0.981, adj=0.7, (0 split)
## length < 199.05 to the left, agree=0.968, adj=0.5, (0 split)
## width < 69.25 to the left, agree=0.968, adj=0.5, (0 split)
##
## Node number 2: 146 observations, complexity param=0.220377
## mean=11427.03, MSE=2.211876e+07
## left son=4 (89 obs) right son=5 (57 obs)
## Primary splits:
## weight < 2544 to the left, improve=0.6487956, (0 missing)
## high_mpg < 28.5 to the right, improve=0.5943413, (0 missing)
## engine_size < 126 to the left, improve=0.5689015, (0 missing)
## hp < 94.5 to the left, improve=0.5402361, (0 missing)
## city_mpg < 23.5 to the right, improve=0.4989328, (0 missing)
## Surrogate splits:
## high_mpg < 28.5 to the right, agree=0.911, adj=0.772, (0 split)
## engine_size < 126 to the left, agree=0.897, adj=0.737, (0 split)
## hp < 104 to the left, agree=0.877, adj=0.684, (0 split)
## city_mpg < 22 to the right, agree=0.863, adj=0.649, (0 split)
## drive_rwd < 0.5 to the left, agree=0.856, adj=0.632, (0 split)
##
## Node number 3: 10 observations
## mean=36964.5, MSE=1.743485e+07
##
## Node number 4: 89 observations, complexity param=0.02239507
## mean=8395.393, MSE=4975546
## left son=8 (63 obs) right son=9 (26 obs)
## Primary splits:
## length < 172.7 to the left, improve=0.4808139, (0 missing)
## weight < 2287.5 to the left, improve=0.4666525, (0 missing)
## wheel_base < 98.6 to the left, improve=0.4051903, (0 missing)
## width < 64.5 to the left, improve=0.4023945, (0 missing)
## hp < 83 to the left, improve=0.3833154, (0 missing)
## Surrogate splits:
## wheel_base < 97.85 to the left, agree=0.921, adj=0.731, (0 split)
## weight < 2301 to the left, agree=0.910, adj=0.692, (0 split)
## engine_size < 115.5 to the left, agree=0.888, adj=0.615, (0 split)
## width < 65.55 to the left, agree=0.876, adj=0.577, (0 split)
## bore < 3.29 to the left, agree=0.831, adj=0.423, (0 split)
##
## Node number 5: 57 observations, complexity param=0.01638806
## mean=16160.63, MSE=1.212868e+07
## left son=10 (50 obs) right son=11 (7 obs)
## Primary splits:
## width < 68.6 to the left, improve=0.2253693, (0 missing)
## hp < 118 to the left, improve=0.2057451, (0 missing)
## cyl_six < 0.5 to the left, improve=0.1899323, (0 missing)
## wheel_base < 100.8 to the left, improve=0.1879356, (0 missing)
## weight < 2697.5 to the left, improve=0.1791694, (0 missing)
## Surrogate splits:
## wheel_base < 108.55 to the left, agree=0.895, adj=0.143, (0 split)
## cyl_others < 0.5 to the left, agree=0.895, adj=0.143, (0 split)
##
## Node number 8: 63 observations
## mean=7401.762, MSE=1378246
##
## Node number 9: 26 observations
## mean=10803.04, MSE=5503013
##
## Node number 10: 50 observations, complexity param=0.01577015
## mean=15542.02, MSE=9831109
## left son=20 (27 obs) right son=21 (23 obs)
## Primary splits:
## hp < 118 to the left, improve=0.3050132, (0 missing)
## engine_size < 162.5 to the left, improve=0.2531597, (0 missing)
## cyl_six < 0.5 to the left, improve=0.2348309, (0 missing)
## weight < 2697.5 to the left, improve=0.1646924, (0 missing)
## peak_rpm < 4375 to the right, improve=0.1423281, (0 missing)
## Surrogate splits:
## city_mpg < 20.5 to the right, agree=0.86, adj=0.696, (0 split)
## engine_size < 154 to the left, agree=0.84, adj=0.652, (0 split)
## cyl_six < 0.5 to the left, agree=0.76, adj=0.478, (0 split)
## high_mpg < 26.5 to the right, agree=0.76, adj=0.478, (0 split)
## height < 54.85 to the right, agree=0.74, adj=0.435, (0 split)
##
## Node number 11: 7 observations
## mean=20579.29, MSE=6281932
##
## Node number 20: 27 observations
## mean=13943.78, MSE=5262308
##
## Node number 21: 23 observations, complexity param=0.01255198
## mean=17418.22, MSE=8675749
## left son=42 (10 obs) right son=43 (13 obs)
## Primary splits:
## stroke < 3.31 to the right, improve=0.5980435, (0 missing)
## high_mpg < 24.5 to the left, improve=0.3815231, (0 missing)
## height < 54.2 to the left, improve=0.3073280, (0 missing)
## compr_ratio < 7.65 to the left, improve=0.2493117, (0 missing)
## body_sedan < 0.5 to the left, improve=0.2394175, (0 missing)
## Surrogate splits:
## height < 54.2 to the left, agree=0.826, adj=0.6, (0 split)
## fuel_sys_mpfi < 0.5 to the left, agree=0.783, adj=0.5, (0 split)
## fuel_sys_others < 0.5 to the right, agree=0.783, adj=0.5, (0 split)
## bore < 3.29 to the left, agree=0.783, adj=0.5, (0 split)
## width < 66.7 to the left, agree=0.739, adj=0.4, (0 split)
##
## Node number 42: 10 observations
## mean=14821.1, MSE=2143683
##
## Node number 43: 13 observations
## mean=19416, MSE=4520804
# Draw the fitted tree (type 2 layout, green palette, 4 significant digits)
rpart.plot(tree_0, digits = 4,type = 2,box.palette = 'Gn')

# Predict test-set prices; type='vector' returns the mean response of the
# leaf each observation falls into
test_tree<-predict(tree_0, newdata = whole_data$test,type = 'vector')
# Collect actuals and predictions side by side; id is just the row position
df_pred<-whole_data$test[, .(id=1:.N,price, test_tree)]
str(df_pred)
## Classes 'data.table' and 'data.frame': 39 obs. of 3 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ price : num 17450 18920 30760 5151 6377 ...
## $ test_tree: num 13944 20579 36964 7402 7402 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Actual price (black) vs tree prediction (red) per test observation
ggplot(melt(df_pred, id.vars = 'id'), aes(x=id,y=value, colour=variable))+
geom_point(alpha=0.65)+geom_line(alpha=0.65)+
ylim(0,50000)+xlab('')+ylab('$')+
ggtitle('Regression Tree - Test Prediction on Automobile Price')+
scale_colour_manual(values = c('black','red'))

#### 1.2 Random Forest
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
# Fit a regression forest with defaults (500 trees, mtry = floor(p/3) = 10).
# NOTE(review): no set.seed() before fitting, so results vary between runs
# — confirm whether reproducibility matters here.
rf_0<-randomForest(formula=formula, data=whole_data$train)
# OOB summary: mean squared residuals and % variance explained
print(rf_0)
##
## Call:
## randomForest(formula = formula, data = whole_data$train)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 10
##
## Mean of squared residuals: 5619661
## % Var explained: 90.78
# Predict on the test set and append next to the tree predictions
test_rf<-predict(rf_0, newdata = whole_data$test, type='response')
df_pred<-cbind(df_pred, test_rf)
str(df_pred)
## Classes 'data.table' and 'data.frame': 39 obs. of 4 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ price : num 17450 18920 30760 5151 6377 ...
## $ test_tree: num 13944 20579 36964 7402 7402 ...
## $ test_rf : num 16786 18333 34747 6027 6002 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Actual (black) vs tree (red) vs random forest (blue)
ggplot(melt(df_pred, id.vars = 'id'), aes(x=id,y=value, colour=variable))+
geom_point(alpha=0.65)+geom_line(alpha=0.65)+
ylim(0,50000)+xlab('')+ylab('$')+
ggtitle('Regression Tree - Test Prediction on Automobile Price')+
scale_colour_manual(values = c('black','red','blue'))

#### 1.3 Boosting Tree
library(xgboost)
# xgboost has no formula interface: drop the target column and convert the
# predictors to a numeric matrix; 'label' carries the target vector.
# Fix vs. original: with=FALSE spelled out (T/F are reassignable bindings).
# NOTE(review): objective 'reg:linear' is a deprecated alias of
# 'reg:squarederror' in current xgboost releases — kept here so the recorded
# output below still matches.
xgb_0<-xgboost(booster='gbtree',
               data=as.matrix(whole_data$train[, !'price', with=FALSE]),
               label=whole_data$train$price,
               nrounds = 50,
               objective='reg:linear')
## [1] train-rmse:11027.429688
## [2] train-rmse:8065.855469
## [3] train-rmse:5963.436523
## [4] train-rmse:4451.936523
## [5] train-rmse:3378.894043
## [6] train-rmse:2617.100586
## [7] train-rmse:2044.587769
## [8] train-rmse:1629.894897
## [9] train-rmse:1320.864014
## [10] train-rmse:1072.105591
## [11] train-rmse:892.479187
## [12] train-rmse:756.755859
## [13] train-rmse:648.792053
## [14] train-rmse:577.898621
## [15] train-rmse:516.197937
## [16] train-rmse:467.336212
## [17] train-rmse:436.486969
## [18] train-rmse:406.966827
## [19] train-rmse:381.598633
## [20] train-rmse:362.346283
## [21] train-rmse:348.406586
## [22] train-rmse:339.302521
## [23] train-rmse:323.668304
## [24] train-rmse:315.626312
## [25] train-rmse:308.365356
## [26] train-rmse:304.067474
## [27] train-rmse:299.807007
## [28] train-rmse:294.491669
## [29] train-rmse:289.260681
## [30] train-rmse:286.898315
## [31] train-rmse:275.956177
## [32] train-rmse:274.439362
## [33] train-rmse:266.735413
## [34] train-rmse:261.335022
## [35] train-rmse:255.965347
## [36] train-rmse:254.613876
## [37] train-rmse:250.018661
## [38] train-rmse:247.678406
## [39] train-rmse:245.217163
## [40] train-rmse:240.946564
## [41] train-rmse:238.491440
## [42] train-rmse:237.925644
## [43] train-rmse:237.266998
## [44] train-rmse:236.705490
## [45] train-rmse:234.588348
## [46] train-rmse:233.108276
## [47] train-rmse:231.601913
## [48] train-rmse:231.403214
## [49] train-rmse:230.231857
## [50] train-rmse:229.325745
# Inspect the fitted booster: params, feature count, niter and RMSE log
print(xgb_0)
## ##### xgb.Booster
## raw: 80.6 Kb
## call:
## xgb.train(params = params, data = dtrain, nrounds = nrounds,
## watchlist = watchlist, verbose = verbose, print_every_n = print_every_n,
## early_stopping_rounds = early_stopping_rounds, maximize = maximize,
## save_period = save_period, save_name = save_name, xgb_model = xgb_model,
## callbacks = callbacks, booster = "gbtree", objective = "reg:linear")
## params (as set within xgb.train):
## booster = "gbtree", objective = "reg:linear", silent = "1"
## xgb.attributes:
## niter
## callbacks:
## cb.print.evaluation(period = print_every_n)
## cb.evaluation.log()
## # of features: 30
## niter: 50
## nfeatures : 30
## evaluation_log:
## iter train_rmse
## 1 11027.4297
## 2 8065.8555
## ---
## 49 230.2319
## 50 229.3257
# Predict on the test matrix (same conversion as training).
# Fix vs. original: with=FALSE spelled out (T/F are reassignable bindings).
test_xgb<-predict(xgb_0, newdata = as.matrix(whole_data$test[, !'price', with=FALSE]), type='response')
df_pred<-cbind(df_pred, test_xgb)
str(df_pred)
## Classes 'data.table' and 'data.frame': 39 obs. of 5 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ price : num 17450 18920 30760 5151 6377 ...
## $ test_tree: num 13944 20579 36964 7402 7402 ...
## $ test_rf : num 16786 18333 34747 6027 6002 ...
## $ test_xgb : num 18353 17226 39207 5644 5737 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Add the boosted-tree predictions (forestgreen) to the comparison plot
ggplot(melt(df_pred, id.vars = 'id'), aes(x=id,y=value, colour=variable))+
  geom_point(alpha=0.65)+geom_line(alpha=0.65)+
  ylim(0,50000)+xlab('')+ylab('$')+
  ggtitle('Regression Tree - Test Prediction on Automobile Price')+
  scale_colour_manual(values = c('black','red','blue','forestgreen'))

#### 2.1 Regression with StepWise feature selection
library(MASS)
# Stepwise model selection by AIC (both directions by default), starting
# from the full linear model price ~ . on the training set.
# Fix vs. original: trace=F spelled out as trace=FALSE (T/F are reassignable).
lm_0<-stepAIC(lm(formula = formula,
                 data=whole_data$train),
              trace=FALSE)
# Coefficients and fit statistics of the AIC-selected model
summary(lm_0)
##
## Call:
## lm(formula = price ~ fuel_gas + body_others + body_wagon + engine_loc_others +
## wheel_base + weight + engine_type_others + cyl_others + cyl_six +
## engine_size + fuel_sys_mpfi + stroke + compr_ratio + peak_rpm +
## city_mpg + make_agg_toyota, data = whole_data$train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5137.6 -1508.6 -169.6 1203.3 10249.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8895.278 10865.034 0.819 0.414355
## fuel_gas -19493.083 5406.508 -3.605 0.000433 ***
## body_others 2236.205 977.614 2.287 0.023682 *
## body_wagon -1264.349 705.093 -1.793 0.075121 .
## engine_loc_others 11125.224 2172.291 5.121 0.000000992671849 ***
## wheel_base 172.141 61.819 2.785 0.006107 **
## weight 3.787 1.594 2.377 0.018840 *
## engine_type_others -4194.023 653.879 -6.414 0.000000002065665 ***
## cyl_others 3458.041 1114.765 3.102 0.002328 **
## cyl_six 4392.530 849.152 5.173 0.000000788665742 ***
## engine_size 114.357 13.924 8.213 0.000000000000134 ***
## fuel_sys_mpfi 1349.560 611.260 2.208 0.028895 *
## stroke -5585.578 872.272 -6.403 0.000000002179275 ***
## compr_ratio -1308.497 389.470 -3.360 0.001007 **
## peak_rpm 1.545 0.628 2.460 0.015139 *
## city_mpg 154.041 83.684 1.841 0.067790 .
## make_agg_toyota -2195.249 606.064 -3.622 0.000409 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2359 on 139 degrees of freedom
## Multiple R-squared: 0.9186, Adjusted R-squared: 0.9093
## F-statistic: 98.1 on 16 and 139 DF, p-value: < 0.00000000000000022
# Predict with the stepwise-selected linear model and append the column
test_lm<-predict(lm_0, newdata = whole_data$test)
df_pred<-cbind(df_pred, test_lm)
str(df_pred)
## Classes 'data.table' and 'data.frame': 39 obs. of 6 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ price : num 17450 18920 30760 5151 6377 ...
## $ test_tree: num 13944 20579 36964 7402 7402 ...
## $ test_rf : num 16786 18333 34747 6027 6002 ...
## $ test_xgb : num 18353 17226 39207 5644 5737 ...
## $ test_lm : num 19378 19208 30497 2257 5859 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Add the linear-model predictions (orange) to the comparison plot
ggplot(melt(df_pred, id.vars = 'id'), aes(x=id,y=value, colour=variable))+
geom_point(alpha=0.65)+geom_line(alpha=0.65)+
ylim(0,50000)+xlab('')+ylab('$')+
ggtitle('Regression Tree - Test Prediction on Automobile Price')+
scale_colour_manual(values = c('black','red','blue','forestgreen','orange'))

#### 2.2 Regression with regularization
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-16
# Cross-validate a lasso (alpha=1) over a lambda path to pick lambda.min.
# NOTE(review): glmnet_0 is immediately overwritten below, discarding the
# cv.glmnet object and its error curve — bind the CV fit to its own name if
# those diagnostics are needed later.
glmnet_0<-cv.glmnet(x = data.matrix(whole_data$train[, !'price']),
y = whole_data$train[['price']],
family = 'gaussian',
alpha=1)
# Refit a single-lambda lasso at the CV-selected lambda.min
glmnet_0<-glmnet(x = data.matrix(whole_data$train[, !'price']),
y = whole_data$train[['price']],
family = 'gaussian',
alpha=1, lambda = glmnet_0$lambda.min)
# Auto-print: df (non-zero coefficients), %deviance explained, lambda
glmnet_0
##
## Call: glmnet(x = data.matrix(whole_data$train[, !"price"]), y = whole_data$train[["price"]], family = "gaussian", alpha = 1, lambda = glmnet_0$lambda.min)
##
## Df %Dev Lambda
## [1,] 22 0.9146 59.9
# Same summary as the auto-print above: df, %Dev and the lambda used
print(glmnet_0)
##
## Call: glmnet(x = data.matrix(whole_data$train[, !"price"]), y = whole_data$train[["price"]], family = "gaussian", alpha = 1, lambda = glmnet_0$lambda.min)
##
## Df %Dev Lambda
## [1,] 22 0.9146 59.9
# Predict with the lasso fit.
# Fix vs. original: the original passed s = 0, nominally requesting the
# UNPENALIZED solution, which contradicts having selected lambda.min by CV.
# Because the model was refit with a single lambda, glmnet returns that
# lambda's coefficients regardless of s, so the recorded output below is
# unchanged — but asking explicitly for the fitted lambda says what is meant.
test_glmnet<-predict(glmnet_0, newx = as.matrix(whole_data$test[, !'price']), s = glmnet_0$lambda)
# predict() returns a 1-column matrix; take column 1 as a plain vector
df_pred<-cbind(df_pred, test_glmnet=test_glmnet[,1])
str(df_pred)
## Classes 'data.table' and 'data.frame': 39 obs. of 7 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ price : num 17450 18920 30760 5151 6377 ...
## $ test_tree : num 13944 20579 36964 7402 7402 ...
## $ test_rf : num 16786 18333 34747 6027 6002 ...
## $ test_xgb : num 18353 17226 39207 5644 5737 ...
## $ test_lm : num 19378 19208 30497 2257 5859 ...
## $ test_glmnet: num 17943 19133 28520 442 5917 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Add the lasso predictions (gray) to the comparison plot
ggplot(melt(df_pred, id.vars = 'id'), aes(x=id,y=value, colour=variable))+
  geom_point(alpha=0.65)+geom_line(alpha=0.65)+
  ylim(0,50000)+xlab('')+ylab('$')+
  ggtitle('Regression Tree - Test Prediction on Automobile Price')+
  scale_colour_manual(values = c('black','red','blue','forestgreen','orange','gray'))

#### 2.3 Boosting Regression
library(xgboost)
# Boosted LINEAR model (booster='gblinear'): each round adds a linear
# update, so the ensemble remains a linear model. Same matrix conversion
# of the predictors as for the tree booster.
# Fix vs. original: with=FALSE spelled out (T/F are reassignable bindings).
# NOTE(review): objective 'reg:linear' is a deprecated alias of
# 'reg:squarederror' in current xgboost — kept so the output below matches.
xgb_reg_0<-xgboost(booster='gblinear',
                   data=as.matrix(whole_data$train[, !'price', with=FALSE]),
                   label=whole_data$train$price,
                   nrounds = 50,
                   objective='reg:linear')
## [1] train-rmse:4828.959961
## [2] train-rmse:4155.766113
## [3] train-rmse:3926.983643
## [4] train-rmse:3810.656250
## [5] train-rmse:3729.830078
## [6] train-rmse:3664.711182
## [7] train-rmse:3608.852783
## [8] train-rmse:3559.350098
## [9] train-rmse:3514.524658
## [10] train-rmse:3473.294922
## [11] train-rmse:3434.931885
## [12] train-rmse:3398.937988
## [13] train-rmse:3364.964600
## [14] train-rmse:3332.764648
## [15] train-rmse:3302.157715
## [16] train-rmse:3273.004395
## [17] train-rmse:3245.196533
## [18] train-rmse:3218.645020
## [19] train-rmse:3193.274658
## [20] train-rmse:3169.019775
## [21] train-rmse:3145.822021
## [22] train-rmse:3123.627686
## [23] train-rmse:3102.388184
## [24] train-rmse:3082.056885
## [25] train-rmse:3062.590820
## [26] train-rmse:3043.950195
## [27] train-rmse:3026.094482
## [28] train-rmse:3008.989502
## [29] train-rmse:2992.599121
## [30] train-rmse:2976.889893
## [31] train-rmse:2961.831787
## [32] train-rmse:2947.393311
## [33] train-rmse:2933.547119
## [34] train-rmse:2920.264893
## [35] train-rmse:2907.520508
## [36] train-rmse:2895.290039
## [37] train-rmse:2883.550049
## [38] train-rmse:2872.277344
## [39] train-rmse:2861.450684
## [40] train-rmse:2851.049316
## [41] train-rmse:2841.054199
## [42] train-rmse:2831.446289
## [43] train-rmse:2822.208984
## [44] train-rmse:2813.324707
## [45] train-rmse:2804.776855
## [46] train-rmse:2796.551270
## [47] train-rmse:2788.633057
## [48] train-rmse:2781.008057
## [49] train-rmse:2773.662598
## [50] train-rmse:2766.585449
# Inspect the linear booster: params, niter and per-round training RMSE log
print(xgb_reg_0)
## ##### xgb.Booster
## raw: 520 bytes
## call:
## xgb.train(params = params, data = dtrain, nrounds = nrounds,
## watchlist = watchlist, verbose = verbose, print_every_n = print_every_n,
## early_stopping_rounds = early_stopping_rounds, maximize = maximize,
## save_period = save_period, save_name = save_name, xgb_model = xgb_model,
## callbacks = callbacks, booster = "gblinear", objective = "reg:linear")
## params (as set within xgb.train):
## booster = "gblinear", objective = "reg:linear", silent = "1"
## xgb.attributes:
## niter
## callbacks:
## cb.print.evaluation(period = print_every_n)
## cb.evaluation.log()
## # of features: 30
## niter: 50
## nfeatures : 30
## evaluation_log:
## iter train_rmse
## 1 4828.960
## 2 4155.766
## ---
## 49 2773.663
## 50 2766.585
# Predict on the test matrix with the boosted linear model.
# Fix vs. original: with=FALSE spelled out (T/F are reassignable bindings).
test_xgb_reg<-predict(xgb_reg_0, newdata = as.matrix(whole_data$test[, !'price', with=FALSE]), type='response')
df_pred<-cbind(df_pred, test_xgb_reg)
str(df_pred)
## Classes 'data.table' and 'data.frame': 39 obs. of 8 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ price : num 17450 18920 30760 5151 6377 ...
## $ test_tree : num 13944 20579 36964 7402 7402 ...
## $ test_rf : num 16786 18333 34747 6027 6002 ...
## $ test_xgb : num 18353 17226 39207 5644 5737 ...
## $ test_lm : num 19378 19208 30497 2257 5859 ...
## $ test_glmnet : num 17943 19133 28520 442 5917 ...
## $ test_xgb_reg: num 23643 21512 26985 8071 6221 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Add the boosted linear model (palegreen) to the comparison plot
ggplot(melt(df_pred, id.vars = 'id'), aes(x=id,y=value, colour=variable))+
  geom_point(alpha=0.65)+geom_line(alpha=0.65)+
  ylim(0,50000)+xlab('')+ylab('$')+
  ggtitle('Regression Tree - Test Prediction on Automobile Price')+
  scale_colour_manual(values = c('black','red','blue','forestgreen','orange','gray','palegreen'))

#### 3. Neural Networks
library(nnet) # simple feed-foward neural network
# Fit a 30-3-1 network: size=3 hidden units; skip=TRUE adds direct
# input->output connections; linout=TRUE gives a linear output unit,
# required for regression (the default is logistic output).
# Fix vs. original: skip=T spelled out as skip=TRUE (T is reassignable).
# NOTE(review): predictors are unscaled and weights are randomly initialised
# with no set.seed(), so the fit varies between runs — consider scaling the
# inputs and fixing the RNG seed. Verify before relying on these results.
nnet_0<-nnet(formula = formula,
             data=whole_data$train,
             size=3,skip=TRUE,
             linout = TRUE)
## # weights: 127
## initial value 23290727608.253834
## iter 10 value 11001526686.852222
## iter 20 value 1619905849.222670
## iter 30 value 745089023.626550
## final value 741405918.436037
## converged
# Short description: architecture, inputs, output, options
print(nnet_0)
## a 30-3-1 network with 127 weights
## inputs: fuel_gas aspiration_turbo doors_others doors_two body_others body_sedan body_wagon drive_others drive_rwd engine_loc_others wheel_base length width height weight engine_type_others cyl_others cyl_six engine_size fuel_sys_idi fuel_sys_mpfi fuel_sys_others bore stroke compr_ratio hp peak_rpm city_mpg high_mpg make_agg_toyota
## output(s): price
## options were - skip-layer connections linear output units
# Full weight listing (input->hidden, hidden->output, skip connections)
summary(nnet_0)
## a 30-3-1 network with 127 weights
## options were - skip-layer connections linear output units
## b->h1 i1->h1 i2->h1 i3->h1 i4->h1 i5->h1 i6->h1
## -0.52 -0.21 -0.29 -0.23 -0.01 0.00 -0.52
## i7->h1 i8->h1 i9->h1 i10->h1 i11->h1 i12->h1 i13->h1
## -0.06 0.01 0.41 0.62 -0.66 -0.60 -0.66
## i14->h1 i15->h1 i16->h1 i17->h1 i18->h1 i19->h1 i20->h1
## 0.05 -0.66 -0.37 0.33 0.20 -0.26 -0.11
## i21->h1 i22->h1 i23->h1 i24->h1 i25->h1 i26->h1 i27->h1
## -0.41 -0.06 0.01 0.61 0.46 0.02 -0.22
## i28->h1 i29->h1 i30->h1
## -0.10 0.00 -0.37
## b->h2 i1->h2 i2->h2 i3->h2 i4->h2 i5->h2 i6->h2
## -0.12 -0.16 -0.33 -0.26 0.70 -0.43 -0.65
## i7->h2 i8->h2 i9->h2 i10->h2 i11->h2 i12->h2 i13->h2
## 0.43 0.13 -0.57 -0.70 0.46 -0.28 0.19
## i14->h2 i15->h2 i16->h2 i17->h2 i18->h2 i19->h2 i20->h2
## -0.34 0.50 -0.69 -0.06 0.13 -0.62 -0.41
## i21->h2 i22->h2 i23->h2 i24->h2 i25->h2 i26->h2 i27->h2
## 0.29 0.23 -0.05 -0.58 -0.63 -0.31 0.60
## i28->h2 i29->h2 i30->h2
## 0.21 0.17 0.25
## b->h3 i1->h3 i2->h3 i3->h3 i4->h3 i5->h3 i6->h3
## 0.52 -0.58 0.11 -0.42 0.40 -0.04 -0.66
## i7->h3 i8->h3 i9->h3 i10->h3 i11->h3 i12->h3 i13->h3
## 0.24 -0.06 -0.02 0.03 -0.62 -0.11 -0.49
## i14->h3 i15->h3 i16->h3 i17->h3 i18->h3 i19->h3 i20->h3
## 0.44 0.45 -0.43 0.44 0.44 0.03 -0.35
## i21->h3 i22->h3 i23->h3 i24->h3 i25->h3 i26->h3 i27->h3
## 0.04 0.54 -0.29 -0.25 -0.70 -0.45 0.05
## i28->h3 i29->h3 i30->h3
## -0.68 -0.42 -0.37
## b->o h1->o h2->o h3->o i1->o i2->o i3->o
## -4445.05 0.53 -4444.69 -4445.79 -11115.82 869.90 -2234.70
## i4->o i5->o i6->o i7->o i8->o i9->o i10->o
## 33.60 2750.63 589.78 -719.46 1150.13 -122.35 10490.01
## i11->o i12->o i13->o i14->o i15->o i16->o i17->o
## 141.57 -42.78 238.12 106.74 3.69 -4246.40 2838.74
## i18->o i19->o i20->o i21->o i22->o i23->o i24->o
## 4409.11 123.36 6669.83 1305.45 -213.10 -103.35 -5569.04
## i25->o i26->o i27->o i28->o i29->o i30->o
## -1208.01 -2.36 1.71 93.81 62.72 -1887.30
# Predict with the network; nnet returns a 1-column matrix, take column 1
test_nnet<-predict(nnet_0, newdata = whole_data$test)
df_pred<-cbind(df_pred, test_nnet=test_nnet[,1])
str(df_pred)
## Classes 'data.table' and 'data.frame': 39 obs. of 9 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ price : num 17450 18920 30760 5151 6377 ...
## $ test_tree : num 13944 20579 36964 7402 7402 ...
## $ test_rf : num 16786 18333 34747 6027 6002 ...
## $ test_xgb : num 18353 17226 39207 5644 5737 ...
## $ test_lm : num 19378 19208 30497 2257 5859 ...
## $ test_glmnet : num 17943 19133 28520 442 5917 ...
## $ test_xgb_reg: num 23643 21512 26985 8071 6221 ...
## $ test_nnet : num 20025 19293 30642 1388 5636 ...
## - attr(*, ".internal.selfref")=<externalptr>
# All eight series (actual + 7 models) on one chart
ggplot(melt(df_pred, id.vars = 'id'), aes(x=id,y=value, colour=variable))+
geom_point(alpha=0.65)+geom_line(alpha=0.65)+
ylim(0,50000)+xlab('')+ylab('$')+
ggtitle('Regression Tree - Test Prediction on Automobile Price')+
scale_colour_manual(values = c('black','red','blue','forestgreen','orange','gray','palegreen','cornflowerblue'))

###############################
#### model evaluation
# Score every model's test predictions against the true price with the
# sourced metrics rmse()/mae()/mape().
# Fix vs. original: the prediction columns were re-subset three times and
# each anonymous function wrapped its result in a redundant return();
# subset once, bind the actuals once, and reuse. The 'method' order must
# match the column order of df_pred (tree, rf, xgb, lm, glmnet, xgb_reg, nnet).
pred_cols<-df_pred[, !c('price','id')]
actual<-df_pred$price
result<-data.table(method=c('tree','rf','xgb','lm','glmnet','xgb_reg','nnet'),
                   rmse=sapply(pred_cols, function(x) rmse(real=actual, predicted=x)),
                   mae=sapply(pred_cols, function(x) mae(real=actual, predicted=x)),
                   mape=sapply(pred_cols, function(x) mape(real=actual, predicted=x)))
result
## method rmse mae mape
## 1: tree 3928.491 2977.303 0.2239767
## 2: rf 2249.769 1588.936 0.1245050
## 3: xgb 3396.981 2212.590 0.1621892
## 4: lm 3205.921 2319.084 0.1995611
## 5: glmnet 3095.549 2127.579 0.1758675
## 6: xgb_reg 3001.003 2256.996 0.1656683
## 7: nnet 3229.050 2330.489 0.1993993
# Best model per metric (random forest wins on all three in this run)
result[which.min(result$rmse)]
## method rmse mae mape
## 1: rf 2249.769 1588.936 0.124505
result[which.min(result$mae)]
## method rmse mae mape
## 1: rf 2249.769 1588.936 0.124505
result[which.min(result$mape)]
## method rmse mae mape
## 1: rf 2249.769 1588.936 0.124505
# Final table: actuals plus one prediction column per model
str(df_pred)
## Classes 'data.table' and 'data.frame': 39 obs. of 9 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ price : num 17450 18920 30760 5151 6377 ...
## $ test_tree : num 13944 20579 36964 7402 7402 ...
## $ test_rf : num 16786 18333 34747 6027 6002 ...
## $ test_xgb : num 18353 17226 39207 5644 5737 ...
## $ test_lm : num 19378 19208 30497 2257 5859 ...
## $ test_glmnet : num 17943 19133 28520 442 5917 ...
## $ test_xgb_reg: num 23643 21512 26985 8071 6221 ...
## $ test_nnet : num 20025 19293 30642 1388 5636 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Store the comparison plot in p so it can be made interactive below
p<-ggplot(melt(df_pred, id.vars = 'id'), aes(x=id,y=value, colour=variable))+
geom_point(alpha=0.65)+geom_line(alpha=0.65)+
ylim(0,50000)+xlab('')+ylab('$')+
ggtitle('Regression Tree - Test Prediction on Automobile Price')+
scale_colour_manual(values = c('black','red','blue','forestgreen','orange','gray','palegreen','cornflowerblue'))
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:xgboost':
##
## slice
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Render the stored ggplot as an interactive (hover/zoom) plotly widget
ggplotly(p)